In real-world data analysis, your data will likely:
Fortunately, pandas can help you with all of this!
import pandas as pd
full_url = 'http://personal.tcu.edu/kylewalker/data/colleges.csv'
full = pd.read_csv(full_url, encoding = 'latin_1')
full.shapecols_to_keep = ['INSTNM', 'STABBR', 'GRAD_DEBT_MDN_SUPP']
debt = full[cols_to_keep]
debt.columns = ['name', 'state', 'debt']
debt.head()debt[0:10].ix[] method (note the brackets)ex1 = debt.set_index('name')
ex1.ix['Amridge University':'Alabama State University']& (and) and | (or).isin() method: checks to see if value is in list of valuesdebt1 = debt[debt['debt'] != 'PrivacySuppressed']
debt1.head()tx_debt = debt[(debt['debt'] != 'PrivacySuppressed') & (debt['state'] == 'TX')]
# Alternatively, use the .query() method
# tx_debt = debt.query('debt != "PrivacySuppressed" & state == "TX" ')
tx_debt.head()states = ['OK', 'NM', 'TX', 'LA']
sw_debt = debt[(debt['debt'] != 'PrivacySuppressed') & (debt['state'].isin(states))]
sw_debt.head().assign() methodimport numpy as np
np.random.seed(1983)
df1 = pd.DataFrame({'col1': np.random.randint(1, 100, 10),
'col2': np.random.randint(1, 100, 10),
'col3': np.random.randint(1, 100, 10)})# Must use index-based labeling for this
df1['col4'] = df1['col1'] + df1['col2']
df2 = df1.assign(col5 = df1['col3'] / df1['col4'])
df2.head()dtype conversion.astype() methodSettingWithCopyWarningsw_debt['debtnum'] = sw_debt.debt.astype(float)
sw_debt.head().dropna() method: delete all rows (or columns) that have any missing values (NaN in pandas)sw2 = sw_debt.dropna()
sw2.head().fillna() method: fill in missing data with a specified valuesw3 = sw_debt.fillna(sw_debt.median())
sw3.head()pandas: .groupby() method!Process:
.groupby() in pandassw_grouped = sw2.groupby('state')
sw_grouped['debtnum'].mean()
# Result
state
LA 15876.255319
NM 16237.466667
OK 17030.860759
TX 15009.426582seabornimport seaborn as sns
sns.boxplot(x = 'state', y = 'debtnum', data = sw2, orient = 'v')seaborng = sns.FacetGrid(data = sw2, col = 'state', col_wrap = 2)
g.map(sns.distplot, 'debtnum')seabornfor s in sw2.state.unique():
data = sw2[sw2.state == s]
sns.kdeplot(data.debtnum, shade = True, label = s)np.random.seed(123456)
m1 = pd.DataFrame({'type': ['a', 'b', 'c', 'd', 'e', 'f'],
'ind1': np.random.randint(1, 100, 6),
'ind2': np.random.randint(1, 100, 6)})
m2 = pd.DataFrame({'type': ['a', 'b', 'c', 'd', 'e', 'f'],
'ind3': np.random.randint(1, 100, 6),
'ind4': np.random.randint(1, 100, 6)}).merge() method in pandasm3 = m1.merge(m2, on = 'type')pandashow parameter): 'inner' (default), 'left', 'right', and 'outer'm4 = pd.DataFrame({'type': ['d', 'e', 'f', 'g', 'h', 'i'],
'ind5': np.random.randint(1, 100, 6),
'ind6': np.random.randint(1, 100, 6)})m5 = m1.merge(m4, on = 'type', how = 'inner')m5 = m1.merge(m4, on = 'type', how = 'left')m5 = m1.merge(m4, on = 'type', how = 'right')m5 = m1.merge(m4, on = 'type', how = 'outer')from pandas_datareader import wb
countries = ['ZA', 'BR', 'US']
urban = wb.download(indicator = 'SP.URB.TOTL.IN.ZS',
country = countries, start = 1960,
end = 2011).reset_index()
urban.head().pivot() method in pandasurban_wide = urban.pivot(index = 'year', columns = 'country',
values = 'SP.URB.TOTL.IN.ZS')
urban_wide.head()urban_wide.plot()pd.melt() function in pandasurban_long = pd.melt(urban_wide.reset_index(), id_vars = 'year',
var_name = 'country', value_name = 'SP.URB.TOTL.IN.ZS')
urban_long.head()